Bayesian Stat

Ch2

Monte Carlo Error

  • Monte Carlo simulations can estimate the posterior statistics under the normal distribution θ ~ N(0,1); Monte Carlo error decreases with the increase of sample size n (蒙特卡洛(Monte Carlo)模拟可以估计正态分布 θ∼N(0,1) 下的后验统计量;蒙特卡洛误差随着样本量 n 的增加而减小)。
a <- c(1, 2)
b <- c(3, 4)
rbind(a,b)
  [,1] [,2]
a    1    2
b    3    4
c(rbind(a, b))
[1] 1 3 2 4
cbind(a,b)
     a b
[1,] 1 3
[2,] 2 4
# Define sample sizes
n_values <- c(10, 100, 1000, 10000, 100000)

# Number of repetitions for each n
reps <- 3

# Initialize storage for results
results <- matrix(NA, nrow = length(n_values), ncol = 2 * reps)

# Loop over different values of n
for (i in 1:length(n_values)) {
  n <- n_values[i]
  
  # 初始化操作--创建一个长度为 reps 的数值向量,并初始化所有元素为 0。
  means <- numeric(reps) 
  sds <- numeric(reps)
  
  # Repeat simulation for each repetition
  for (j in 1:reps) {
    sample <- rnorm(n)  # Generate n samples from N(0,1)
    means[j] <- mean(sample)  # Compute sample mean
    sds[j] <- sd(sample)  # Compute sample standard deviation
  }
  
  
  # Store results 
  results[i, ] <- c(rbind(means, sds))
}

# Convert results into a data frame
results_df <- as.data.frame(results)
colnames(results_df) <- c("Mean_1", "SD_1", "Mean_2", "SD_2", "Mean_3", "SD_3")
rownames(results_df) <- n_values

# Print results
print(results_df)
             Mean_1      SD_1       Mean_2      SD_2       Mean_3      SD_3
10    -0.2445700288 1.1550644 -0.091085100 1.0482625 -0.260854860 1.0028421
100    0.0860550662 0.9535699  0.030139854 0.9901484 -0.031680833 0.8784368
1000  -0.0279788512 1.0112929 -0.017108535 1.0568201 -0.021729665 0.9758514
10000 -0.0153251513 1.0126919  0.002784212 0.9935932 -0.004934942 0.9982627
1e+05 -0.0002515695 0.9970474 -0.003181742 0.9967342  0.001247449 0.9992713
# Compute Monte Carlo errors for repetition 1
# sapply() is one of the apply() series of functions that apply some function to each element of a list or vector and return the vector.
mc_errors <- sapply(1:length(n_values), function(i) {sd(rnorm(n_values[i])) / sqrt(n_values[i])})  

# Print Monte Carlo errors
print(mc_errors)
[1] 0.285541025 0.106054832 0.030772116 0.010018110 0.003164834

Direct Sampling

Uniform Generates Everything-1 (Uniform生万物)

\(F(x)=P(X\leq x)\), generate U~U[0,1], let X=\(F^{-1}(U)\)

Proof

Let X=\(F^{-1}(U)\)

\(P(X\leq x) = P(F^{-1}(U)\leq x)\) F is cdf–F is strictly monotonic, increasing and continuous function of x hence \(P(F^{-1}(U)\leq x)=P(F(F{-1}(U)\leq F(x))\) i.e. \(P(U\leq F(x))\). As U is a U[0,1] random variable, so that \(P(U\leq F(x))\) = F(x) since \(P(U\leq x)\) is the cdf of U~[0,1] i.e. \(P(F^{-1}(U)) \leq x)=F(x)\)\(P(X\leq x) = F(x)\) since we have let X=\(F^{-1}(U)\).

So, each PDF could be represented as a function of U where U~[0,1].

Rejection Sampling

Uniform Generates Everything-2 (Uniform生万物)

Simulate \(\theta\) Y~U0,g\(\theta\) –accepting rule of Y

1d eg

Background picture:

f = function(x){
  return(exp(-x^2))
}


# numeric calculation(Riemann sum)
dx=0.01
x=seq(-5,5,dx)
f.x=f(x)
sum(f.x*dx)
[1] 1.772454
sqrt(pi)
[1] 1.772454
# Rejection Sampling to estimate f(x)= exp(-x^2)
# Uniform(-5,5) not bounded so we choose c* it later.
g.x = runif(10000, -5, 5)

peak_f <- f(0) #- standard normal

# 计算均匀分布的峰值
peak_g_uniform <- 1 / (5 - (-5))

c = peak_f / peak_g_uniform # how much (the highest point of) f.x is bigger than g.x

# g.x在-5到5间
t = f(g.x) / (c * dunif(g.x, -5, 5))
u = runif(10000) # Y-axis distribution(since 0-1, uniform!)
?runif
indices = (u < t)
hist(g.x[indices])

ratio = sum(indices) / length(indices)
integral.f = ratio * c
integral.f
[1] 1.817
# Normal Distribution
g.x = rnorm(1000)

# calculate c to bound totally
g <- function(x) {
  return(dnorm(x))
}

# 计算 f(x) 和 g(x) 在 x = 0 处的值
f_0 <- f(0)
g_0 <- g(0)

# 计算 c 的值
c <- f_0 / g_0

# 输出 c 的值
c # c=2.51
[1] 2.506628
c = 2.55 #稍微取大点
t = f(g.x) / (c * dnorm(g.x))
u = runif(1000) # Y-axis distribution(since 0-1, uniform!)
indices = (u < t)
hist(g.x[indices])

ratio = sum(indices) / length(indices)
integral.f = ratio * c
integral.f
[1] 1.8207

Importance Sampling

eg.

g.x = rnorm(100000)
w = f(g.x) / dnorm(g.x)
mean(w)
[1] 1.768807
g.x = runif(100000, -5, 5)
w = f(g.x) / dunif(g.x, -5, 5)
mean(w)
[1] 1.748921

SIR(Sampling Importance Resampling)

这个过程在统计学中被称为重要性重采样或采样重要性重采样(SIR),是一种模拟从复杂分布中抽样的方法。 在SIR方法中,原始样本集合中的某些样本可能会因为具有较高的概率权重而被多次抽取,而其他样本则可能很少或根本不被抽取。这种方法可以用来近似目标分布,特别是当目标分布难以直接抽样时。

# Sampling Importance Resampling - SIR
# f(x) - desired (unnormalized) distribution - 5*chi-square(df=5)
# g(x) - proposed distribution - exponential
n = 1000000 #sample size
# 从提议分布中生成样本
x_exp_sample = rexp(n) # Samples are generated from the exponential distribution as a proposed distribution.
#  绘制提议分布的直方图
par(mfrow=c(1,1)) # 1 row 1 column
hist(x_exp_sample) 

# 计算目标分布的未归一化概率密度
f_x_unnormalized_pdf = 5*dchisq(x_exp_sample, df=5) # dchisq 函数返回的是概率密度值,而不是概率值(即未归一化的概率值),所以这里乘以5(或其他常数)是为了得到未归一化的卡方分布的概率密度值。f_x_unnormalized_pdf存储了每个指数分布样本点在自由度为5的卡方分布下的未归一化概率密度值

#  计算提议分布的概率密度
g_x_pdf = dexp(x_exp_sample) # 计算提议分布(指数分布)的概率密度函数。

#权重
weights = f_x_unnormalized_pdf / g_x_pdf # 计算每个样本的权重
#归一化权重
weights_norm = weights / sum(weights) # 归一化权重
# resampling
f_x_sample = sample(x_exp_sample, size = n, replace = TRUE, prob = weights_norm) 

# 绘制目标分布和重采样结果的直方图(对比真实和resampling)
par(mfrow=c(1,2))
x = rchisq(n, df=5)
hist(x[x<15], breaks = seq(0,15,1), main="Chi-Square")
hist(f_x_sample, breaks = seq(0,15,1), main="SIR")

# moving from f1(x) to f2(x)从卡方分布转移到 F 分布
f_1 = dchisq(f_x_sample, df=5)
f_2 = df(f_x_sample, df1=5, df2=25)
new_weights = f_2 / f_1

#检查权重均值
mean(new_weights) # should be ~ 1
[1] 1.000189
mean(weights)
[1] 5.006193
# 归一化新权重
new_weights_norm = new_weights / sum(new_weights)

#  重采样生成 F 分布样本

f2_x_sample = sample(f_x_sample, size = n, replace = TRUE, prob = new_weights_norm)

# 绘制 F 分布和重采样结果的直方图
par(mfrow=c(1,2))
x = rf(n, 5, 25)
hist(x[x<15], breaks = seq(0,15,1), main="F(5,25)")
hist(f2_x_sample, breaks = seq(0,15,1), main="SIR")

MCMC

  • trace plot

We choose N(0.5,\(\sigma\)) as the proposal distribution

上图中,右侧是每次迭代输入的theta的值,左侧是每个theta出现的次数, and if \(\theta_t\)~ N(\(\theta_{t-1}\), \(\sigma\))—-

but why choose to depend on the previous..

Metropolis-Hastings

acceptance probability= \(\alpha (\theta_{new},\theta_{t-1})\)= min[r(\(\theta_{new},\theta_{t-1}\)),1], where r is the ratio of the new parameter’s and the t-1 parameter’s posterior

then draw u~uniform(0,1) and if u is less than \(\alpha\) then \(\theta_t=\theta_{new}\) otherwise \(\theta_t=\theta_{t-1}\)

# 设置随机种子以确保结果可复现
set.seed(123)

# 定义后验分布的密度函数,beta-- prior,binomial-- likelihood(实际上,后验分布应该是Beta(alpha + k, beta + n - k),因为Beta分布是二项分布的共轭先验,但此处为了联系MH采样方法解决此问题)

posterior_density <- function(theta, alpha, beta, n, k) {
  if (theta < 0 || theta > 1) return(0)  # 确保theta在[0, 1]范围内
  return(dbeta(theta, shape1 = alpha, shape2 = beta) * dbinom(k, size = n, prob = theta))
}


# 计算接受概率--在MH算法中,接受概率应该是 (后验(candidate) * proposal(current | candidate)) / (后验(current) * proposal(candidate | current))。但这里使用的是对称的提议分布(正态分布),所以提议密度的比值是1,这时候接受率可以简化为后验的比值。

# Metropolis-Hastings算法(添加接受率计算)
metropolis_hastings <- function(posterior, proposal_sd, initial_value, iterations, alpha, beta, n, k) {
  theta <- numeric(iterations)
  theta[1] <- initial_value
  accept <- 0  # 接受计数器
  
  for (t in 2:iterations) {
    # 提出新的候选值
    candidate <- rnorm(1, mean = theta[t-1], sd = proposal_sd)
    
    # 计算接受概率(由于提议分布对称,转移概率比率为1)
    acceptance_ratio <- min(1, posterior_density(candidate, alpha, beta, n, k) / 
                             posterior_density(theta[t-1], alpha, beta, n, k))
    
    # 生成均匀分布的随机数
    u <- runif(1)
    
    # 更新状态
    if (u < acceptance_ratio) {
      theta[t] <- candidate
      accept <- accept + 1
    } else {
      theta[t] <- theta[t-1]
    }
  }
  
  # 计算接受率
  accept_rate <- accept / (iterations - 1)
  return(list(samples = theta, accept_rate = accept_rate))
}

# 参数设置
alpha <- 1   # Beta先验参数
beta <- 1
n <- 10      # 试验次数
k <- 4       # 成功次数
proposal_sd <- 0.3  # 调整提议分布标准差(影响接受率)
initial_value <- 0.2
iterations <- 10000

# 运行MCMC
result <- metropolis_hastings(posterior_density, proposal_sd, initial_value, iterations, alpha, beta, n, k)
samples <- result$samples
cat("接受率:", result$accept_rate, "\n")
接受率: 0.4835484 
# 后处理参数
burn_in <- 2000     # 去除前2000次迭代
thin <- 5           # 每5次抽样保留1次

# 后处理
samples_post_burn <- samples[(burn_in + 1):iterations]
thinned_samples <- samples_post_burn[seq(1, length(samples_post_burn), by = thin)]

# 绘制诊断图形


# 1. 原始链的轨迹图
plot(samples, type = "l", col = "grey", 
     main = "origin Markov trace plot", 
     xlab = "iterations", ylab = "theta")

# 绘制原始链的轨迹图的密度图
density_samples <- density(samples)
plot(density_samples, main = "Density of MCMC Samples", xlab = "Theta", ylab = "Density")

# 2. 处理后链的轨迹图
plot(thinned_samples, type = "l", col = "skyblue",
     main = paste("the processed trace plot (burn-in =", burn_in, ", thin =", thin, ")"), 
     xlab = "sampling times", ylab = "theta")

density_proposed_samples <- density(thinned_samples)
plot(density_proposed_samples, main = "Density of MCMC thinning Samples", xlab = "Theta", ylab = "Density")

# 3. 自相关图
acf(thinned_samples, main = "The processed autocorrelation graph", 
    col = "darkred", lag.max = 40)

# 4. 后验分布直方图与真实后验对比 通过比较直方图和理论后验分布,可以评估MCMC采样是否成功地从真实的后验分布中抽取样本。
hist(thinned_samples, breaks = 30, probability = TRUE, 
     main = "Posterior distribution comparison", xlab = "theta", col = "lightgreen")
curve(dbeta(x, shape1 = alpha + k, shape2 = beta + n - k),  # 真实后验
      add = TRUE, col = "darkblue", lwd = 2, lty = 2)
legend("topright", legend = c("MCMC samples", "real posterior distribution"),
       col = c("lightgreen", "darkblue"), lty = c(1, 2), lwd = c(5, 2),
       cex =0.3)

暂不看哈Monte Carlo Simulation (Crash Course on Monte Carlo Simulation from Very Normal vlogger)

eg of t-tests

results = logical(10000)
for (i in 1:10000){
  # Student's t-test assumes variances are equal; Welch's t-test assumes variances of the groups are not equal; 
styleA = rnorm(n=27,mean=10500,sd=1200)
styleB = rnorm(n=27,mean=10500+600,sd=1200)
test=t.test(styleA, styleB, var.equal = T)
results[i] = test[["p.value"]] <0.05

  
}

library(tidyverse)
sims = expand.grid(
  test =c("Student","Welch", "MW"),
  rep = 1:10000
  
) |>
mutate(
  filename=paste0(test, "-",rep,".rds")
)  
seed = row_number()


pwalk(
  list(sims[["test"]],
       sims[["filename"]],
       sims[["seed"]]),
  runReplication
  
)

runReplilation = function(test, filename, seed) {
  set.seed(seed)
  
  A=rnorm(27, mean = 10500, sd=1200)
  B=rnorm(27,mean=11000,sd=1200)
  
  if(test == "Student")
    hypothesis_test = t.test(A,B,var.equal=T)
  if(test == "Welch")
    hypothesis_test =t.test(A,B,var.equal=F)
  if(test == "MW")
    hypothesis_test = wilcox.test(A,B)
  
  result = list(
    testname =test,
    result = hypohtesis_test[["p.value"]]<0.05
  )

  }

R basic coding

  • Density plot
theta <- c(1, 2, 3, 4, 5)
plot(density(theta), type="l")